home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Amiga Format CD 24
/
Amiga Format AFCD24 (Feb 1998, Issue 108).iso
/
-in_the_mag-
/
emulation
/
adfs
/
coma.adf
/
BinaryGCD.asm
< prev
next >
Wrap
Assembly Source File
|
1998-01-04
|
6KB
|
209 lines
* 68K implementation of Donald Knuth's binary Greatest Common Divisor algorithm
* The Art of Computer Programming, Volume 2, Seminumerical Algorithms, page 321
* Discovered by J Stein in 1961, published in J Comp Phys 1, page 397-405, 1967
* Written in generic 68000 code and tested on 68060 by Simon N Goodwin 31/12/97
*
* To do: register equates, tests on more CPUs
*
* Register parameters
*
* Input: D0 and D1, POSITIVE non-zero words to be tested (U and V)
* Result: D0, greatest common divisor of the inputs (R)
*
* Variables
*
* K D2 ; Shift count for common powers of 2
* T D3 ; Contender to replace larger of U and V
* S D4 ; Scratch
*
* All values are treated as 32 bit signed long words
*
*******************************************************************************
*
* Test bed - returns number of pairs with common factor >1, should be 40% ish
* If any 'optimisation' changes this number, you must have broken something!
*
GEN2 EQU 1 ; 1 for 2 random calls per GCF, 0 for one
LOOPS EQU 10000
*
Test move.l #LOOPS,d7 ; Loop count
moveq #0,d5 ; Factor count
bra.s loop ; Just in case CNOP doesn't
nop
*
* Generate two random numbers. The generator does produce two at once
* in D0 and D1 so only one call is really needed - but D0 is not very
* random, hence 2 distinct calls for each test are preferable (GEN2).
*
cnop 0,8 ; Align loop start
loop jsr LongRand60
move.l d1,d6
lsr.l #1,d6 ; Keep it positive
beq.s loop ; Avoid zero
* Set GEN2 to 0 to use high and low words of one 64 bit random number
* instead of two 32 bit values generated separately (with GEN2 EQU 1)
ifne GEN2
jsr LongRand60 ; Get another rather random set
lsr.l #1,d1 ; Stay positive, most random word
beq.s loop ; Unlikely but fatal!
move.l d6,d0
else
lsr.l #1,d0 ; Stay positive
beq.s loop ; Unlikely but fatal!
move.l d6,d1
endc
Try1 jsr GCD_68000 ; Do the real work
subq.l #1,d0
beq.s no_divisor
addq.l #1,d5 ; Count one more success
no_divisor subq.l #1,d7 ; Keep trying
bgt.s loop
Done
* moveq #0,d0
move.l d5,d0 ; Return success count
rts
*
* Long word polynomial pseudo-random number generator
* This is optimised for 68060s but runs on others too
*
cnop 0,8 ; Align subroutine start
*
LongRand60 move.l Rand60+4,d0 ; Soep
move.l Rand60,d1 ; Poep
andi.b #14,d0 ; Soep
ori.b #32,d0 ; Poep
move.l d1,d3 ; Soep
move.l d0,d2 ; Poep
add.l d2,d2 ; Soep (bypass in)
addx.l d3,d3 ; Poep only
add.l d2,d0 ; Poep
moveq #16,d4 ; Soep - rotation factor
addx.l d3,d1 ; Poep only
ror.l d4,d3 ; Poep
ror.l d4,d2 ; Soep
move.w d2,d3 ; Poep
clr.w d2 ; Soep
add.l d2,d0 ; Poep
move.l d0,Rand60+4 ; Soep (bypass out)
addx.l d3,d1 ; Poep only
move.l d1,Rand60 ; Poep
rts
*
cnop 0,8 ; Long align data (or better)
Rand60 dc.l 0,0 ; Seed
*
* 68060 time for a million tests with generic GCD code: 10.68 seconds
* RTS at start of GCD_68000 = without generic GCD code: 1.86 seconds
*
* Thus the total loop overhead including random number generation is
* less than 20 per cent - clearly GCD computation time predominates.
* All times are approximate as I'm testing an active (but not busy)
* system without resorting to the use of FORBID/DISABLE pairs, etc.
*
* The number of calls to the random function may be halved by using
* both halves of the 64 bit random number as two 31 bit numbers:
*
* 68060 time for a million tests with generic GCD code: 10.05 seconds
* RTS at start of GCD_68000 = without generic GCD code: 1.15 seconds
*
* Thus the test rig is contributing getting on for as much time as
* one pass through the randomising function (which seems to be taking
* around two thirds of a second or perhaps 30 T states, suggesting that
* the subroutine call overhead may be the next thing usefully trimmed.
* Just adding some NOPs saved three to four per cent of total run time!
*
* 68060 GEN2 set, 1 million tests with generic GCD code: 10.28 seconds
* 68060 GEN2 unset, million tests with generic GCD code: 9.74 seconds
*
*******************************************************************************
*
* Generic 68000 version - first and simplest version
*
* This is an attempt at a straight-forward solution of the problem,
* but a few 68000-specific performance peculiarities are indulged.
*
* Three days trying to produce a faster version for the 68060 failed,
* including BTST #0s and a restoring version of the B3 loop, so this
* is the definitive 68K version, unless I think of something better!
*
* IF ANY INPUT IS NEGATIVE OR ZERO THIS ALGORITHM MAY NOT TERMINATE!
*
cnop 0,8 ; Align branch target
*
GCD_68000
*
* First stage, count and discard low-order zero bits in both operands
*
B1 moveq #0,d2 ; Initial shift count
S1 move.l d0,d4 ; Combine operands U an V
or.l d1,d4
and.w #1,d4 ; Test low order bit
; (.b is no faster and .l is longer)
bne.s B2 ; Exit unless both U and V are even
asr.l #1,d0 ; Shift out zero bit
asr.l #1,d1 ; Do the same for both operands
addq.l #1,d2 ; Count one power of two in factor
bra.s S1 ; Try again
*
* Second stage, initialise T to (U & 1) ? -V : U
*
B2 moveq #1,d4 ; Mask for least significant bit
and.w d0,d4 ; Test bit 0 of U (BTST #0 may be slower)
beq.s U_even ; Continue only if U is odd
move.l d1,d3 ; Set T to V
neg.l d3 ; Set T to -V
bra.s B4 ; Some CPUs favour TRAPF.L ($51FB)
U_even move.l d0,d3 ; Set T to U
*
* Third stage, T is now non-zero and even. Halve it.
*
B3 asr.l #1,d3
*
* Fourth stage, See if T is still even
*
B4 moveq #1,d4 ; Bit 0 mask
and.w d3,d4 ; Test low order bit (word, see above)
beq.s B3 ; Iterate until T becomes odd
*
* Fifth stage, Reset MAX(U,V) so larger of U and V is replaced by ABS(T)
* T > 0 ? U := T : V := -T
*
B5 move.l d3,d4 ; Copy T to scratch, implicitly testing it
bgt.s Replace_U
neg.l d4 ; Generate -T
move.l d4,d1 ; Set V to -T
move.l d0,d3 ; Set T to U in preparation for stage 6
Replace_U move.l d3,d0 ; Set U to T - not worth skipping!
*
* Sixth stage, Set T to U-V and iterate till T=0
*
B6 sub.l d1,d3 ; Set T to U-V
bne.s B3 ; Iterate while T<>0
*
* Epilogue, result is U * 2 ^ K
*
lsl.l d2,d0 ; Scale the result in D0
rts
*
*******************************************************************************
*
* dc.w $51FA ; Test TRAPF.W, skip word; $51FB skips long
end